!pip install plotly
Requirement already satisfied: plotly in c:\users\mainuddin\anaconda3\lib\site-packages (5.9.0) Requirement already satisfied: tenacity>=6.2.0 in c:\users\mainuddin\anaconda3\lib\site-packages (from plotly) (8.0.1)
WARNING: There was an error checking the latest version of pip.
import pandas as pd
import datetime
from datetime import date,timedelta
import plotly.graph_objects as go # Slicer
import plotly.express as px #Visualization
import plotly.io as pio #template
# Make "plotly_white" the default template for plotly figures.
pio.templates.default="plotly_white"
# 1) Load the workbook into a DataFrame and preview its first 100 rows.
data = pd.read_excel(io="twtr.xlsx")
preview = data.head(n=100)
print(preview)
Date Open High Low Close Adj Close \
0 2013-11-07 45.099998 50.090000 44.000000 44.900002 44.900002
1 2013-11-08 45.930000 46.939999 40.685001 41.650002 41.650002
2 2013-11-11 40.500000 43.000000 39.400002 42.900002 42.900002
3 2013-11-12 43.660000 43.779999 41.830002 41.900002 41.900002
4 2013-11-13 41.029999 42.869999 40.759998 42.599998 42.599998
.. ... ... ... ... ... ...
95 2014-03-27 45.090000 46.400002 43.310001 46.320000 46.320000
96 2014-03-28 46.650002 47.340000 45.700001 47.299999 47.299999
97 2014-03-31 47.549999 47.750000 46.430000 46.669998 46.669998
98 2014-04-01 46.709999 47.590000 46.180000 46.980000 46.980000
99 2014-04-02 47.400002 47.439999 45.509998 45.730000 45.730000
Volume
0 117701670.0
1 27925307.0
2 16113941.0
3 6316755.0
4 8688325.0
.. ...
95 15507597.0
96 9610491.0
97 5794497.0
98 6916147.0
99 7911260.0
[100 rows x 7 columns]
# 2) Give the column insights.
# DataFrame.info() writes its report to stdout itself and returns None, so
# wrapping it in print() only appends a stray "None" line to the output.
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 2264 entries, 0 to 2263 Data columns (total 7 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Date 2264 non-null datetime64[ns] 1 Open 2259 non-null float64 2 High 2259 non-null float64 3 Low 2259 non-null float64 4 Close 2259 non-null float64 5 Adj Close 2259 non-null float64 6 Volume 2259 non-null float64 dtypes: datetime64[ns](1), float64(6) memory usage: 123.9 KB None
# 3) Count the missing values in every column of the dataset.
null_counts = data.isna().sum()
print(null_counts)
Date 0 Open 5 High 5 Low 5 Close 5 Adj Close 5 Volume 5 dtype: int64
# Drop every row that holds at least one missing value, then preview the
# first 100 remaining rows.
data = data.dropna(axis=0, how="any")
print(data.iloc[:100])
Date Open High Low Close Adj Close \
0 2013-11-07 45.099998 50.090000 44.000000 44.900002 44.900002
1 2013-11-08 45.930000 46.939999 40.685001 41.650002 41.650002
2 2013-11-11 40.500000 43.000000 39.400002 42.900002 42.900002
3 2013-11-12 43.660000 43.779999 41.830002 41.900002 41.900002
4 2013-11-13 41.029999 42.869999 40.759998 42.599998 42.599998
.. ... ... ... ... ... ...
95 2014-03-27 45.090000 46.400002 43.310001 46.320000 46.320000
96 2014-03-28 46.650002 47.340000 45.700001 47.299999 47.299999
97 2014-03-31 47.549999 47.750000 46.430000 46.669998 46.669998
98 2014-04-01 46.709999 47.590000 46.180000 46.980000 46.980000
99 2014-04-02 47.400002 47.439999 45.509998 45.730000 45.730000
Volume
0 117701670.0
1 27925307.0
2 16113941.0
3 6316755.0
4 8688325.0
.. ...
95 15507597.0
96 9610491.0
97 5794497.0
98 6916147.0
99 7911260.0
[100 rows x 7 columns]
# 4) Descriptive statistics of the cleaned data.
summary = data.describe()
print(summary)
Open High Low Close Adj Close \
count 2259.000000 2259.000000 2259.000000 2259.000000 2259.000000
mean 36.020286 36.699881 35.339465 36.003625 36.003625
std 14.118463 14.372057 13.828724 14.089989 14.089989
min 13.950000 14.220000 13.725000 14.010000 14.010000
25% 25.550000 26.215001 24.912501 25.410000 25.410000
50% 35.419998 36.099998 34.820000 35.490002 35.490002
75% 44.205000 45.015000 43.327501 44.135000 44.135000
max 78.360001 80.750000 76.050003 77.629997 77.629997
Volume
count 2.259000e+03
mean 2.175186e+07
std 1.909988e+07
min 0.000000e+00
25% 1.233530e+07
50% 1.691305e+07
75% 2.428082e+07
max 2.692131e+08
# Print the element-wise null mask of the data (True marks a missing cell).
# NOTE(review): the step number of this check is not known from this file.
print(data.isnull())
Date Open High Low Close Adj Close Volume 0 False False False False False False False 1 False False False False False False False 2 False False False False False False False 3 False False False False False False False 4 False False False False False False False ... ... ... ... ... ... ... ... 2254 False False False False False False False 2255 False False False False False False False 2256 False False False False False False False 2257 False False False False False False False 2258 False False False False False False False [2259 rows x 7 columns]
# 6) Run a Z-test or a T-test on the High, Low and Close columns and report
# whether the null hypothesis is rejected or accepted.
# One-sample z-test for the High column against a hypothesised mean of 50.
import statistics as st
from statsmodels.stats import weightstats as stest
from numpy import random

high = data.loc[:, 'High']
print(high)

# Sample mean and sample standard deviation of the column.
high_mean = st.mean(high)
high_stdv = st.stdev(high)
print('Mean Data = ', high_mean)
print('Standard daviation = ', high_stdv)

ztest, pval = stest.ztest(x1=high, value=50)
print('Z test score ', ztest)
print('p value ', pval)

# Decide at the 5 % significance level.
print('Reject Null hypothesis' if pval < 0.05 else 'Accept Null hypothesis')
0 50.090000
1 46.939999
2 43.000000
3 43.779999
4 42.869999
...
2254 50.750000
2255 51.860001
2256 53.180000
2257 53.500000
2258 54.000000
Name: High, Length: 2259, dtype: float64
Mean Data = 36.69988069278442
Standard daviation = 14.372056692309659
Z test score -43.984000817502285
p value 0.0
Reject Null hypothesis
# One-sample z-test for the Low column against a hypothesised mean of 30.
low = data.loc[:, 'Low']

# Sample mean and sample standard deviation of the column.
low_mean = st.mean(low)
low_stdv = st.stdev(low)
print('Mean = ', low_mean)
print('Standard Daviation = ', low_stdv)

ztest, pval = stest.ztest(x1=low, value=30)
print('Z test score = ', ztest)
print('p-value = ', pval)

# Decide at the 5 % significance level.
print('Reject null hypothesis' if pval < 0.05 else 'Accept null hypothesis')
Mean = 35.339464800354136 Standard Daviation = 13.828723572649182 Z test score = 18.35159210972168 p-value = 3.205795428365976e-75 Reject null hypothesis
# One-sample z-test for the Close column against a hypothesised mean of 30.
close = data.loc[:, 'Close']

# Sample mean and sample standard deviation of the column.
close_mean = st.mean(close)
close_stdv = st.stdev(close)
print('Mean = ', close_mean)
print('Standard Daviation ', close_stdv)

ztest, pval = stest.ztest(x1=close, value=30)
print('Z test score = ', ztest)
print('p-value = ', pval)

# Decide at the 5 % significance level.
print('Reject Null hypothesis' if pval < 0.05 else 'accept null hypothesis')
Mean = 36.00362549048251 Standard Daviation 14.08998893401524 Z test score = 20.251679995652125 p-value = 3.433291673809823e-91 Reject Null hypothesis
# One-sample t-test for the High column against a hypothesised mean of 30.
import statistics as st
from scipy.stats import ttest_1samp

high = data.loc[:, 'High']

# Sample mean and sample standard deviation of the column.
high_mean = st.mean(high)
high_stdv = st.stdev(high)
print('Mean = ', high_mean)
print('Standard Daviaion = ', high_stdv)

t_test, pval = ttest_1samp(a=high, popmean=30)
print('t test score = ', t_test)
print('p-value = ', pval)

# Decide at the 5 % significance level.
print('reject null hypothesis' if pval < 0.05 else 'Accept null hypothesis')
Mean = 36.69988069278442 Standard Daviaion = 14.372056692309659 t test score = 22.15676048174423 p-value = 1.3745989635531811e-98 reject null hypothesis
# 7) One-way ANOVA on the High and Low columns: report the F value and the
# p-value, then state whether the null hypothesis is rejected or accepted.
import pandas as pd
import scipy.stats
import io

# Reload the workbook; the preview is printed before incomplete rows are dropped.
data = pd.read_excel("twtr.xlsx")
print(data.head())
data = data.dropna()

# The two groups being compared.
grp1, grp2 = data['High'], data['Low']
print(grp1.head())
print(grp2.head())

F, pval = scipy.stats.f_oneway(grp1, grp2)
print('F value ', F)
print('p-value = ', pval)

# Decide at the 5 % significance level.
print('reject Null hypothesis' if pval < 0.05 else 'accept Null Hypothesis')
Date Open High Low Close Adj Close \
0 2013-11-07 45.099998 50.090000 44.000000 44.900002 44.900002
1 2013-11-08 45.930000 46.939999 40.685001 41.650002 41.650002
2 2013-11-11 40.500000 43.000000 39.400002 42.900002 42.900002
3 2013-11-12 43.660000 43.779999 41.830002 41.900002 41.900002
4 2013-11-13 41.029999 42.869999 40.759998 42.599998 42.599998
Volume
0 117701670.0
1 27925307.0
2 16113941.0
3 6316755.0
4 8688325.0
0 50.090000
1 46.939999
2 43.000000
3 43.779999
4 42.869999
Name: High, dtype: float64
0 44.000000
1 40.685001
2 39.400002
3 41.830002
4 40.759998
Name: Low, dtype: float64
F value 10.510084066032649
p-value = 0.0011958722702637046
reject Null hypothesis
# 8) Check if the data is dependent or independent by using the chi-square method.
# A chi-square test of independence needs a contingency table of two
# categorical variables. scipy.stats.chisquare on a single price column is a
# goodness-of-fit test against equal expected frequencies, so it cannot say
# whether two columns depend on each other. High and Low are therefore binned
# into quartiles and the cross-tabulated counts are tested.
from scipy import stats

datas = {'High': data['High'], 'Low': data['Low']}
print(datas)

# Quartile bins turn the continuous prices into categories; duplicates='drop'
# keeps qcut from raising if repeated prices make two bin edges coincide.
high_bins = pd.qcut(datas['High'], q=4, duplicates='drop')
low_bins = pd.qcut(datas['Low'], q=4, duplicates='drop')
contingency = pd.crosstab(high_bins, low_bins)
print(contingency)

# chi2_contingency returns the statistic, the p-value, the degrees of freedom
# and the table of expected counts under independence.
chisq, pval, dof, expected = stats.chi2_contingency(contingency)
alpha = 0.05
print('Chi-square value = ', chisq)
print('Degrees of freedom = ', dof)
print('p-value = ', pval)
if pval < alpha:
    print('Dependent (reject Ho)')
else:
    print('Independnt (Accept Ho)')
{'High': 0 50.090000
1 46.939999
2 43.000000
3 43.779999
4 42.869999
...
2254 50.750000
2255 51.860001
2256 53.180000
2257 53.500000
2258 54.000000
Name: High, Length: 2259, dtype: float64, 'Low': 0 44.000000
1 40.685001
2 39.400002
3 41.830002
4 40.759998
...
2254 49.549999
2255 50.520000
2256 52.200001
2257 52.770000
2258 53.700001
Name: Low, Length: 2259, dtype: float64}
Chi-square value = 12218.788864355833
p-value = 0.0
Dependent (reject Ho)
# Outliers
# Demonstrate z-score outlier detection on synthetic data: 200 draws from a
# normal distribution (mean 100, standard deviation 5) with two extreme
# values planted at positions 90 and 50.
from scipy import stats
import pandas as pd
import numpy as np

mu, sigma = 100, 5
array = np.random.normal(mu, sigma, 200)
array[90] = 180
array[50] = -40
df = pd.DataFrame(array, columns=['Data'])
print(df)

# Absolute z-scores, so a single threshold covers both tails.
z = np.abs(stats.zscore(df))
print(z)

# Reduce to one boolean per row. Indexing a DataFrame with a boolean
# DataFrame only masks cells to NaN; a row-wise mask actually selects rows.
is_outlier = (z > 3).any(axis=1)
print('No of Outliers= ', int(is_outlier.sum()))
print('Outliers are: ', df[is_outlier])

# Keep only the rows whose |z| does not exceed the threshold.
df_new = df[~is_outlier]
print(df_new)
print(df.shape)
print(df_new.shape)
Data
0 98.907696
1 99.774472
2 101.983482
3 99.685256
4 105.027707
.. ...
195 107.481660
196 106.674779
197 87.443957
198 92.601350
199 93.575813
[200 rows x 1 columns]
Data
0 0.050156
1 0.019840
2 0.198228
3 0.012636
4 0.444063
.. ...
195 0.642230
196 0.577071
197 0.975904
198 0.559422
199 0.480729
[200 rows x 1 columns]
No of Outliers= Data 2
dtype: int64
Outliers are: Data
0 98.907696
1 99.774472
2 101.983482
3 99.685256
4 105.027707
.. ...
195 107.481660
196 106.674779
197 87.443957
198 92.601350
199 93.575813
[200 rows x 1 columns]
Data
0 98.907696
1 99.774472
2 101.983482
3 99.685256
4 105.027707
.. ...
195 107.481660
196 106.674779
197 87.443957
198 92.601350
199 93.575813
[200 rows x 1 columns]
(200, 1)
(200, 1)
# Reload the workbook and count the missing values per column.
data = pd.read_excel(io='twtr.xlsx')
print(data.isna().sum())
Date 0 Open 5 High 5 Low 5 Close 5 Adj Close 5 Volume 5 dtype: int64
# Drop incomplete rows and print the per-column missing-value counts again.
data = data.dropna(axis=0, how='any')
print(data.isna().sum())
Date 0 Open 0 High 0 Low 0 Close 0 Adj Close 0 Volume 0 dtype: int64
# Preview the first 50 rows of the cleaned data.
print(data.iloc[:50])
Date Open High Low Close Adj Close \
0 2013-11-07 45.099998 50.090000 44.000000 44.900002 44.900002
1 2013-11-08 45.930000 46.939999 40.685001 41.650002 41.650002
2 2013-11-11 40.500000 43.000000 39.400002 42.900002 42.900002
3 2013-11-12 43.660000 43.779999 41.830002 41.900002 41.900002
4 2013-11-13 41.029999 42.869999 40.759998 42.599998 42.599998
5 2013-11-14 42.340000 45.669998 42.240002 44.689999 44.689999
6 2013-11-15 45.250000 45.270000 43.430000 43.980000 43.980000
7 2013-11-18 43.500000 43.950001 40.849998 41.139999 41.139999
8 2013-11-19 41.389999 41.900002 40.000000 41.750000 41.750000
9 2013-11-20 41.400002 41.750000 40.509998 41.049999 41.049999
10 2013-11-21 41.250000 42.490002 40.369999 42.060001 42.060001
11 2013-11-22 41.810001 42.279999 40.970001 41.000000 41.000000
12 2013-11-25 41.080002 41.139999 38.799999 39.060001 39.060001
13 2013-11-26 39.160000 40.549999 38.919998 40.180000 40.180000
14 2013-11-27 40.470001 41.400002 40.349998 40.900002 40.900002
15 2013-11-29 41.400002 41.580002 40.900002 41.570000 41.570000
16 2013-12-02 41.790001 42.000000 40.400002 40.779999 40.779999
17 2013-12-03 40.689999 41.599998 40.540001 41.369999 41.369999
18 2013-12-04 41.270000 43.919998 41.270000 43.689999 43.689999
19 2013-12-05 43.450001 46.349998 42.830002 45.619999 45.619999
20 2013-12-06 45.750000 45.799999 44.540001 44.950001 44.950001
21 2013-12-09 45.590000 49.840000 45.020000 49.139999 49.139999
22 2013-12-10 48.900002 52.580002 48.700001 51.990002 51.990002
23 2013-12-11 52.400002 53.869999 51.000000 52.340000 52.340000
24 2013-12-12 52.200001 55.869999 50.689999 55.330002 55.330002
25 2013-12-13 56.200001 59.410000 55.450001 59.000000 59.000000
26 2013-12-16 57.860001 60.240002 55.759998 56.610001 56.610001
27 2013-12-17 56.970001 57.380001 54.619999 56.450001 56.450001
28 2013-12-18 57.000000 57.000000 54.230000 55.509998 55.509998
29 2013-12-19 55.080002 57.750000 55.000000 57.490002 57.490002
30 2013-12-20 58.509998 60.250000 58.009998 60.009998 60.009998
31 2013-12-23 59.849998 64.989998 59.700001 64.540001 64.540001
32 2013-12-24 66.339996 70.870003 65.559998 69.959999 69.959999
33 2013-12-26 72.879997 74.730003 69.130096 73.309998 73.309998
34 2013-12-27 70.099998 71.250000 63.689999 63.750000 63.750000
35 2013-12-30 60.270000 63.709999 58.570000 60.509998 60.509998
36 2013-12-31 62.360001 65.220001 61.650002 63.650002 63.650002
37 2014-01-02 65.000000 67.500000 64.400002 67.500000 67.500000
38 2014-01-03 69.000000 70.430000 68.431999 69.000000 69.000000
39 2014-01-06 64.830002 66.870003 63.500000 66.290001 66.290001
40 2014-01-07 67.669998 67.730003 61.389999 61.459999 61.459999
41 2014-01-08 58.709999 61.259998 57.919998 59.290001 59.290001
42 2014-01-09 59.540001 60.810001 55.590000 57.049999 57.049999
43 2014-01-10 57.500000 58.759998 55.869999 57.000000 57.000000
44 2014-01-13 59.980000 60.380001 57.293999 57.820000 57.820000
45 2014-01-14 58.880001 59.020000 57.360001 58.209999 58.209999
46 2014-01-15 59.110001 61.750000 58.320000 61.570000 61.570000
47 2014-01-16 61.450001 62.400002 60.459999 60.570000 60.570000
48 2014-01-17 63.599998 64.690002 61.590000 62.200001 62.200001
49 2014-01-21 63.330002 63.439999 61.500000 62.529999 62.529999
Volume
0 117701670.0
1 27925307.0
2 16113941.0
3 6316755.0
4 8688325.0
5 11099433.0
6 8010663.0
7 12810624.0
8 7436616.0
9 5767325.0
10 8324753.0
11 6185245.0
12 14333375.0
13 9828433.0
14 5536322.0
15 4107074.0
16 6427386.0
17 5776893.0
18 11028953.0
19 11813520.0
20 6236232.0
21 17366614.0
22 25792002.0
23 26631535.0
24 23446870.0
25 38979567.0
26 39310848.0
27 22115199.0
28 16659776.0
29 13174896.0
30 26207420.0
31 22163787.0
32 35802698.0
33 82761072.0
34 60418668.0
35 55538253.0
36 27858516.0
37 29286655.0
38 33254610.0
39 27303649.0
40 31806111.0
41 27304350.0
42 31121971.0
43 22391578.0
44 21039027.0
45 14810026.0
46 21646397.0
47 16755251.0
48 28440701.0
49 13739691.0
# 11) Show the Twitter stock prices over the years and give a conclusion.
import plotly.graph_objects as go
import pandas as pd

# Build the candlestick trace from the Date/Open/High/Low/Close columns
# first, then wrap it in a figure.
candles = go.Candlestick(
    x=data['Date'],
    open=data['Open'],
    high=data['High'],
    low=data['Low'],
    close=data['Close'],
)
figure = go.Figure(data=[candles])
#figure.update_layout(title="Twitter stock prices over the years ",xaxis_rangeslider_visible=False)
figure.show()
# 12) Compare the Close column against the Date column for Twitter prices
# over the years.
import plotly.express as px

# Bars are positioned by date, sized by closing price and coloured by it.
bar_spec = {'x': 'Date', 'y': 'Close', 'color': 'Close'}
figure = px.bar(data, **bar_spec)

# Show the range slider on the x-axis.
figure.update_xaxes(rangeslider_visible=True)
figure.show()
# 13) Visualize the missing values of `data` with a heatmap.
import seaborn as sns
import matplotlib.pyplot as plt

# Boolean mask of missing cells; the heatmap is drawn without a colour bar.
null_mask = data.isnull()
plt.figure(figsize=(16, 10))
sns.heatmap(null_mask, cbar=False, cmap="YlGnBu")
plt.show()
# Missing-value heatmap drawn from a freshly loaded copy of the workbook,
# this time with a colour bar.
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt

dt = pd.read_excel('twtr.xlsx')
missing = dt.isnull()
plt.figure(figsize=(16, 10))
sns.heatmap(missing, cbar=True, cmap='YlGnBu')
plt.show()
# 14) Assign buttons to control time periods. Add the buttons to analyze the
# stock prices of Twitter in different time periods.
figure = px.bar(data, x='Date', y='Close', color='Close')

# The range slider is hidden. Switching it on first and off right after set
# the same layout property twice, so only the final value is kept here.
figure.update_layout(
    title='Twitter stock prices over the years',
    xaxis_rangeslider_visible=False,
)

# Range-selector buttons: each one looks back `count` steps, and the last
# button has no count and covers the whole series.
figure.update_xaxes(
    rangeselector=dict(
        buttons=[
            dict(count=1, label='1m', step='month', stepmode='backward'),
            dict(count=3, label='3m', step='month', stepmode='backward'),
            dict(count=6, label='6m', step='month', stepmode='backward'),
            dict(count=1, label='1y', step='year', stepmode='backward'),
            dict(count=2, label='2y', step='year', stepmode='backward'),
            dict(step='all'),
        ]
    )
)
figure.show()
# 15) Give the complete timeline of Twitter in the stock market. (Line Graph)
data['Date'] = pd.to_datetime(data['Date'], format='%Y-%m-%d')
data['Year'] = data['Date'].dt.year
data['Month'] = data['Date'].dt.month

# Plot against the full date, not the month number. With x='Month' every
# trading day of a month shares one x value, so each year's line collapses
# into vertical strokes at 1..12 instead of a timeline. The Year column is
# still used to colour the line per year.
fig = px.line(
    data,
    x='Date',
    y='Close',
    color='Year',
    title='Complete timeline of twitter',
)
fig.show()
# Dendrogram built from a random 15x20 matrix, drawn on a white plot background.
import plotly.figure_factory as ff
import numpy as np

x = np.random.random_sample(size=(15, 20))
fig = ff.create_dendrogram(x)
fig.update_layout({'plot_bgcolor': 'white'})
fig.show()